Imports & Dataset


In [9]:
import math
import numpy

In [10]:
# Toy dataset in column-major (one-list-per-feature) layout; each inner list
# holds one feature for the same 7 people, aligned by position:
#   dataset[0] -> name, dataset[1] -> gender,
#   dataset[2] -> age range, dataset[3] -> height range.
dataset = [['dugg', 'clare', 'will', 'donald', 'deril', 'gregory', 'julia'],
           ['M', 'F', 'M', 'M', 'M', 'M', 'F'],
           ['20-30', '20-30', '20-30', '20-30', '30-40', '20-30', '5-10'],
           ['1.60-1.70', '1.70-1.80', '1.70-1.80', '1.80-1.90', '1.70-1.80', '>1.90', '<1.60']]

In [11]:
labels = ['no', 'no', 'yes', 'yes', 'no', 'no', 'yes']

Helper Functions - Decision Trees


In [12]:
def calc_entropy(labels):
    """Shannon entropy (natural-log base) of a sequence of class labels.

    Args:
        labels: sequence of hashable class labels.

    Returns:
        float: sum over distinct labels of p * ln(1/p), where p is the
        label's relative frequency. Returns 0.0 for an empty sequence.
    """
    n = float(len(labels))
    # Relative frequency of each distinct label.
    probs = [labels.count(lbl) / n for lbl in set(labels)]
    # p * log(1/p) == -p * log(p); start value 0.0 keeps the empty case a float.
    return sum((p * math.log(1.0 / p) for p in probs), 0.0)

In [13]:
def calc_split_entropy(splitted_labels):
    """Weighted average entropy of a partition of labels (post-split entropy).

    Args:
        splitted_labels: iterable of label groups (e.g. the .values() of
            the dict produced by `split_by`).

    Returns:
        float: sum over groups of (group size / total size) * calc_entropy(group).
    """
    groups = list(splitted_labels)
    total = sum(len(group) for group in groups)
    weighted = 0.0
    for group in groups:
        weight = len(group) / float(total)
        weighted += weight * calc_entropy(group)
    return weighted

In [14]:
def split_by(feature_index, dataset, labels):
    """Group labels by the value of one feature.

    Args:
        feature_index: index of the feature row in `dataset` to split on.
        dataset: column-major dataset (list of feature lists, all the same length).
        labels: class label for each sample, aligned with the feature lists.

    Returns:
        dict mapping each distinct feature value to the list of labels of
        the samples carrying that value, in original order.
    """
    splitted_labels = {}
    for i, v in enumerate(dataset[feature_index]):
        # dict.has_key() was removed in Python 3; setdefault covers both
        # the first-seen and already-seen cases in one call.
        splitted_labels.setdefault(v, []).append(labels[i])
    return splitted_labels

In [15]:
def calc_variance(values):
    """Fraction of distinct values in the sequence.

    NOTE(review): despite the name, this is a diversity ratio
    (distinct count / total count), not a statistical variance.

    Args:
        values: non-empty sequence of hashable values.

    Returns:
        float in (0, 1]: len(set(values)) / len(values).
    """
    distinct = set(values)
    return len(distinct) / float(len(values))

Example Calculations — entropy before and after splitting on the height feature


In [25]:
calc_split_entropy(split_by(3, dataset, labels).values())


Out[25]:
0.2727917864120626

In [17]:
calc_entropy(labels)


Out[17]:
0.6829081047004716

In [24]:
calc_variance(dataset[3])


Out[24]:
0.7142857142857143

In [ ]: